library(data.table)
## Warning: package 'data.table' was built under R version 4.2.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(mlr3)
## Warning: package 'mlr3' was built under R version 4.2.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.3
library(ranger)
## Warning: package 'ranger' was built under R version 4.2.3
library(class)
## Warning: package 'class' was built under R version 4.2.3
library(rmarkdown)
## Warning: package 'rmarkdown' was built under R version 4.2.3
# --- Exploratory data analysis --------------------------------------------
# Inspect the training-set structure: 16 numeric columns (V1-V16; presumably
# EEG channel readings -- TODO confirm with the data source) plus an integer
# target V17 coded 1/2 (eye status).
str(train_data)
## Classes 'data.table' and 'data.frame': 10486 obs. of 17 variables:
## $ V1 : num 4271 4290 4269 4278 4305 ...
## $ V2 : num 4268 4247 4268 4242 4278 ...
## $ V3 : num 4024 3943 4002 3974 3991 ...
## $ V4 : num 4634 4625 4620 4589 4613 ...
## $ V5 : num 4124 4110 4119 4095 4131 ...
## $ V6 : num 4349 4324 4342 4315 4342 ...
## $ V7 : num 4220 4239 4218 4227 4234 ...
## $ V8 : num 4100 4056 4060 4046 4079 ...
## $ V9 : num 4209 4203 4208 4182 4211 ...
## $ V10: num 4628 4605 4634 4602 4632 ...
## $ V11: num 4192 4203 4196 4196 4207 ...
## $ V12: num 4218 4230 4249 4223 4231 ...
## $ V13: num 4290 4276 4267 4264 4280 ...
## $ V14: num 4236 4165 4095 4085 4179 ...
## $ V15: num 4567 4603 4586 4590 4619 ...
## $ V16: num 4340 4340 4340 4343 4354 ...
## $ V17: int 1 1 1 1 2 1 2 2 2 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Interactive preview of the first rows of each dataset.
paged_table(head(train_data))
paged_table(head(test_data))
# Five-number summaries.  Note the extreme maxima (V4, V5, V8, V16 reach
# values above 100 000) and the per-column NA counts (V5: 15, V8: 6,
# V14: 10319) -- both are addressed during data cleaning below.
summary(train_data)
## V1 V2 V3 V4 V5
## Min. :1045 Min. :1047 Min. :2831 Min. : 2762 Min. : 2453
## 1st Qu.:4280 1st Qu.:4251 1st Qu.:3991 1st Qu.: 4610 1st Qu.: 4108
## Median :4294 Median :4262 Median :4006 Median : 4618 Median : 4120
## Mean :4301 Mean :4264 Mean :4010 Mean : 4654 Mean : 4183
## 3rd Qu.:4311 3rd Qu.:4272 3rd Qu.:4024 3rd Qu.: 4627 3rd Qu.: 4132
## Max. :7402 Max. :5768 Max. :7804 Max. :362561 Max. :642572
## NA's :15
## V6 V7 V8 V9 V10
## Min. :2085 Min. :1816 Min. : 3584 Min. :1353 Min. :4555
## 1st Qu.:4331 1st Qu.:4220 1st Qu.: 4058 1st Qu.:4190 1st Qu.:4604
## Median :4339 Median :4229 Median : 4070 Median :4200 Median :4614
## Mean :4342 Mean :4231 Mean : 4127 Mean :4201 Mean :4616
## 3rd Qu.:4348 3rd Qu.:4240 3rd Qu.: 4084 3rd Qu.:4210 3rd Qu.:4625
## Max. :6472 Max. :6683 Max. :567191 Max. :7148 Max. :7259
## NA's :6
## V11 V12 V13 V14 V15
## Min. :3282 Min. :1816 Min. :3095 Min. :4057 Min. : 79.93
## 1st Qu.:4189 1st Qu.:4220 1st Qu.:4267 1st Qu.:4112 1st Qu.:4589.85
## Median :4201 Median :4229 Median :4277 Median :4153 Median :4603.20
## Mean :4202 Mean :4231 Mean :4279 Mean :4157 Mean :4604.96
## 3rd Qu.:4212 3rd Qu.:4240 3rd Qu.:4288 3rd Qu.:4205 3rd Qu.:4618.12
## Max. :6142 Max. :6683 Max. :7007 Max. :4255 Max. :4831.53
## NA's :10319
## V16 V17
## Min. : 1357 Min. :1.000
## 1st Qu.: 4341 1st Qu.:1.000
## Median : 4355 Median :1.000
## Mean : 4440 Mean :1.449
## 3rd Qu.: 4373 3rd Qu.:2.000
## Max. :715901 Max. :2.000
##
summary(test_data)
## V1 V2 V3 V4 V5
## Min. : 4199 Min. :4197 Min. :3906 Min. :4002 Min. :4058
## 1st Qu.: 4281 1st Qu.:4250 1st Qu.:3990 1st Qu.:4611 1st Qu.:4108
## Median : 4294 Median :4263 Median :4005 Median :4618 Median :4121
## Mean : 4370 Mean :4265 Mean :4009 Mean :4620 Mean :4122
## 3rd Qu.: 4312 3rd Qu.:4271 3rd Qu.:4023 3rd Qu.:4626 3rd Qu.:4131
## Max. :309231 Max. :6881 Max. :5501 Max. :4757 Max. :5416
## V6 V7 V8 V9 V10
## Min. :4309 Min. :3915 Min. :2086 Min. : 4148 Min. :4572
## 1st Qu.:4332 1st Qu.:4221 1st Qu.:4058 1st Qu.: 4191 1st Qu.:4605
## Median :4339 Median :4230 Median :4070 Median : 4200 Median :4614
## Mean :4342 Mean :4231 Mean :4073 Mean : 4260 Mean :4616
## 3rd Qu.:4347 3rd Qu.:4240 3rd Qu.:4084 3rd Qu.: 4209 3rd Qu.:4624
## Max. :5455 Max. :4363 Max. :4178 Max. :265641 Max. :4770
## V11 V12 V13 V14 V15
## Min. :4106 Min. :3915 Min. :2258 Min. :4000 Min. : 4450
## 1st Qu.:4190 1st Qu.:4221 1st Qu.:4268 1st Qu.:4050 1st Qu.: 4591
## Median :4201 Median :4230 Median :4277 Median :4099 Median : 4603
## Mean :4203 Mean :4231 Mean :4279 Mean :4100 Mean : 4639
## 3rd Qu.:4211 3rd Qu.:4240 3rd Qu.:4288 3rd Qu.:4149 3rd Qu.: 4618
## Max. :6823 Max. :4363 Max. :4395 Max. :4200 Max. :152308
## V16 V17
## Min. :4212 Min. :1.000
## 1st Qu.:4342 1st Qu.:1.000
## Median :4355 Median :1.000
## Mean :4362 Mean :1.448
## 3rd Qu.:4373 3rd Qu.:2.000
## Max. :5023 Max. :2.000
# Missing values per column: V14 is almost entirely NA (10319 of 10486 rows).
colSums(is.na(train_data))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 0 0 0 0 15 0 0 6 0 0 0 0 0
## V14 V15 V16 V17
## 10319 0 0 0
# Record each column's class: all predictors numeric, target is integer.
# (sapply is acceptable here for interactive inspection, though vapply
# would be the type-stable choice in non-interactive code.)
column_types <- sapply(train_data, class)
column_types
## V1 V2 V3 V4 V5 V6 V7 V8
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## V9 V10 V11 V12 V13 V14 V15 V16
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## V17
## "integer"
We can see the structure of the datasets. I called summary() and also checked the type of each column to get a better understanding of the data. In the training set, column V14 has 10319 NA's, which makes this column essentially useless. Some columns have very large maximum values. Now I will plot the data and see whether I get some interesting insights.
# --- Visual inspection of every channel -----------------------------------
# One scatter plot per column: observation index on x, channel value on y,
# coloured by the target V17, so outliers stand out.
# The original code repeated the same loop body three times; a small helper
# removes the duplication, and seq_len() replaces the 1:nrow() idiom.
make_channel_plots <- function(cols) {
  plots <- list()
  for (col in cols) {
    # get(col) resolves the column name inside data.table's frame.
    plots[[col]] <- plot_ly(data = train_data,
                            x = seq_len(nrow(train_data)),
                            y = train_data[, get(col)],
                            color = train_data[, V17],
                            type = "scatter", mode = "markers")
  }
  plots
}
plots1 <- make_channel_plots(c("V1", "V2", "V3", "V4", "V5", "V6"))
plots2 <- make_channel_plots(c("V7", "V8", "V9", "V10", "V11", "V12"))
plots3 <- make_channel_plots(c("V13", "V14", "V15", "V16"))
subplot(plots1, nrows = 3, titleY = T, margin = 0.05)
## Warning: Ignoring 15 observations
subplot(plots2, nrows = 3, titleY = T, margin = 0.05)
## Warning: Ignoring 6 observations
subplot(plots3, nrows = 2, titleY = T, margin = 0.05)
## Warning: Ignoring 10319 observations
On these plots we can see that there are a few outliers in each variable. These outliers could be errors or genuine anomalies. In V4, V5, V8 and V16 the outliers must be errors, because their values exceed 300 000. In the other variables the outliers may simply be anomalies. I will remove the erroneous outliers during data cleaning. We can also see that V14 has significantly fewer plotted values, because the column contains more than 10000 NA's. I will need to deal with this column in data cleaning.
# --- Missing-value overview and removal of V14 ----------------------------
# Report dataset sizes and total NA counts before cleaning.
print(paste("number of observations in training data:",nrow(train_data)))
## [1] "number of observations in training data: 10486"
print(paste("number of NA's in train data:",sum(is.na(train_data))))
## [1] "number of NA's in train data: 10340"
print(paste("number of observations in test data:",nrow(test_data)))
## [1] "number of observations in test data: 4494"
print(paste("number of NA's in test data:",sum(is.na(test_data))))
## [1] "number of NA's in test data: 0"
# Confirm that almost all training NA's (10319 of 10340) sit in V14 ...
sum(is.na(train_data[,V14]))
## [1] 10319
# ... so drop the column from both datasets (`:=` deletes in place).
train_data[, V14 := NULL]
test_data[,V14:=NULL]
# 21 NA's remain (15 in V5, 6 in V8 per colSums above).
print(paste("number of na's in training data after deleting V14 column:",sum(is.na(train_data))))
## [1] "number of na's in training data after deleting V14 column: 21"
# Drop the 21 remaining NA rows.
# BUG FIX: na.omit() returns a new object and does NOT modify its argument;
# the original call discarded the result, so the NA rows were never actually
# removed.  Assign the cleaned table back, then echo it as before.
train_data <- na.omit(train_data)
train_data
## V1 V2 V3 V4 V5 V6 V7 V8
## <num> <num> <num> <num> <num> <num> <num> <num>
## 1: 4271.243 4268.296 4023.855 4633.701 4123.519 4349.317 4219.938 4099.909
## 2: 4289.559 4246.557 3942.566 4625.247 4109.779 4324.064 4238.694 4055.565
## 3: 4269.343 4268.123 4002.423 4619.655 4119.264 4341.975 4218.083 4059.838
## 4: 4277.774 4241.873 3974.092 4588.951 4094.744 4315.371 4226.871 4045.703
## 5: 4305.415 4278.254 3990.935 4612.909 4131.057 4341.833 4234.395 4078.737
## ---
## 10461: 4278.239 4236.561 3984.617 4602.907 4106.490 4331.174 4224.743 4061.523
## 10462: 4242.714 4250.394 4001.537 4614.621 4114.467 4332.298 4230.501 4052.240
## 10463: 4283.146 4220.684 3959.518 4616.671 4087.848 4328.896 4240.378 4070.796
## 10464: 4302.162 4277.361 3998.248 4616.635 4116.586 4351.271 4227.369 4094.908
## 10465: 4342.272 4250.866 3984.202 4618.894 4118.444 4365.235 4256.441 4074.424
## V9 V10 V11 V12 V13 V15 V16 V17
## <num> <num> <num> <num> <num> <num> <num> <int>
## 1: 4208.919 4628.077 4191.718 4217.955 4289.988 4566.711 4340.008 1
## 2: 4203.122 4605.427 4203.027 4230.053 4275.575 4603.046 4339.954 1
## 3: 4207.642 4633.651 4195.585 4249.387 4266.658 4585.675 4340.059 1
## 4: 4181.712 4602.162 4196.075 4223.062 4263.628 4590.401 4342.895 1
## 5: 4210.574 4631.565 4206.704 4230.628 4279.925 4619.407 4354.185 2
## ---
## 10461: 4209.730 4622.018 4197.167 4230.838 4269.357 4588.413 4339.385 2
## 10462: 4180.039 4605.330 4172.590 4213.924 4246.346 4565.201 4296.446 1
## 10463: 4203.843 4614.622 4194.042 4233.497 4272.198 4581.772 4336.622 2
## 10464: 4201.546 4616.036 4213.862 4256.186 4280.359 4601.565 4374.190 2
## 10465: 4211.373 4603.222 4222.590 4251.372 4292.710 4631.441 4389.726 2
We can see that there are around 10500 observations in total. There are 10340 NA's in the training data set. I already know that most of the NA's (10319) are in the V14 column, so I will simply delete that column. After deleting it only 21 NA's remain, which I can omit. There are no NA's in the test data set.
# --- Train/test split proportions ------------------------------------------
# Row counts via data.table's .N; the original computed each count twice,
# the duplicate lines are removed.
train_number_of_observations <- train_data[,.N]
test_number_of_observations <- test_data[,.N]
total_observations <- train_number_of_observations + test_number_of_observations
# Percentage of all observations that fall into each split.
distribution_percentage_train <- train_number_of_observations / total_observations * 100
distribution_percentage_test <- test_number_of_observations / total_observations * 100
distribution_percentage_train
## [1] 70
distribution_percentage_test
## [1] 30
I calculated the train test split. The data are divided with 70/30 train test split.
# --- Outlier removal, target rename, x/y split -----------------------------
# Drop any row whose value in a predictor exceeds 50 000 -- those extreme
# readings (V4, V5, V8, V16 above 100 000 in the summaries) are almost
# certainly measurement errors.  NOTE(review): rows where the comparison
# yields NA are also dropped by data.table's logical subsetting.
for (col in names(train_data[, !"V17"])) {
  train_data <- train_data[!(get(col) > 50000)]
}
# Rename the target column by NAME rather than by position.  The original
# used colnames(...)[16], which silently renames the wrong column if the
# column order ever changes; setnames() is the data.table idiom.
setnames(train_data, "V17", "eyes_status")
setnames(test_data, "V17", "eyes_status")
# Separate predictors (x) from the target (y) in both datasets, ready for
# model fitting and evaluation.
train_x <- train_data[,!"eyes_status"]
train_y <- train_data[,eyes_status]
test_x <- test_data[,!"eyes_status"]
test_y <- test_data[,eyes_status]
# Explicit factor levels so confusionMatrix() comparisons work later.
test_y <- factor(test_y, levels = c(1, 2), labels = c("1", "2"))
I removed rows with outliers that are likely measurement errors. I renamed the column holding the open/closed-eye label, so the data table now makes more sense. I separated the independent variables and the target variable from both the training and test datasets, which prepares them for evaluating the classification models. I am also converting test_y to a factor with levels 1 and 2 for creating confusion matrices later.
# --- Decision tree, default rpart settings ---------------------------------
# Fit a classification tree for eyes_status on all predictors with rpart's
# default control parameters (cp = 0.01, minsplit = 20).
dtree_default <- rpart::rpart(eyes_status ~ ., method = "class", data = train_data)
# Cross-validated complexity table: xerror is the CV error relative to the
# root-node error, used below to judge tree quality.
printcp(dtree_default)
##
## Classification tree:
## rpart::rpart(formula = eyes_status ~ ., data = train_data, method = "class")
##
## Variables actually used in tree construction:
## [1] V13 V15 V16 V3 V4 V8
##
## Root node error: 4701/10463 = 0.4493
##
## n= 10463
##
## CP nsplit rel error xerror xstd
## 1 0.098064 0 1.00000 1.00000 0.010823
## 2 0.057009 1 0.90194 0.92129 0.010717
## 3 0.032121 2 0.84493 0.85110 0.010574
## 4 0.028079 3 0.81281 0.83110 0.010525
## 5 0.019783 5 0.75665 0.76686 0.010340
## 6 0.014465 7 0.71708 0.73984 0.010250
## 7 0.013827 8 0.70262 0.73452 0.010231
## 8 0.012551 9 0.68879 0.72772 0.010207
## 9 0.010000 10 0.67624 0.70921 0.010139
# Visualise the CV error against cp, then draw the fitted tree.
plotcp(dtree_default)
rpart.plot(dtree_default, type = 2, extra = 101, fallen.leaves = F, main = "Classification Tree for eye status", tweak=1.2)
The first, default decision tree does not perform well: the cross-validated error (xerror) at level 9 is 0.70921. The decrease in the complexity-parameter values with each split suggests that growing and then pruning the tree could improve its performance. Across the nine CP levels I see no overfitting, meaning there is room for improvement of the decision tree. Next I can grow the full tree and check for overfitting; if it appears, I can prune the tree afterwards.
# --- Fully grown decision tree ---------------------------------------------
# minsplit = 1 and cp = 0 disable all stopping criteria, so the tree grows
# until every leaf is pure -- deliberately overfit, to be pruned afterwards.
dtree_full <- rpart::rpart(eyes_status ~ ., method = "class", data = train_data,
control = rpart.control(minsplit = 1, cp = 0))
# The cptable now has 64 levels; rel error reaches 0 (perfect training fit)
# while xerror bottoms out mid-table and then rises -- classic overfitting.
printcp(dtree_full)
##
## Classification tree:
## rpart::rpart(formula = eyes_status ~ ., data = train_data, method = "class",
## control = rpart.control(minsplit = 1, cp = 0))
##
## Variables actually used in tree construction:
## [1] V1 V10 V11 V12 V13 V15 V16 V2 V3 V4 V5 V6 V7 V8 V9
##
## Root node error: 4701/10463 = 0.4493
##
## n= 10463
##
## CP nsplit rel error xerror xstd
## 1 0.09806424 0 1.000000 1.00000 0.0108234
## 2 0.05700915 1 0.901936 0.92023 0.0107153
## 3 0.03212083 2 0.844927 0.84876 0.0105687
## 4 0.02807913 3 0.812806 0.80664 0.0104595
## 5 0.01978302 5 0.756648 0.76431 0.0103321
## 6 0.01446501 7 0.717081 0.72814 0.0102087
## 7 0.01382685 8 0.702616 0.72155 0.0101847
## 8 0.01255052 9 0.688790 0.72049 0.0101808
## 9 0.00829611 10 0.676239 0.69730 0.0100925
## 10 0.00723250 11 0.667943 0.67560 0.0100045
## 11 0.00531802 13 0.653478 0.67305 0.0099938
## 12 0.00489258 16 0.637524 0.65667 0.0099234
## 13 0.00416933 18 0.627739 0.65369 0.0099103
## 14 0.00404169 28 0.568815 0.64327 0.0098635
## 15 0.00393533 29 0.564773 0.64199 0.0098577
## 16 0.00361625 31 0.556903 0.64050 0.0098508
## 17 0.00333262 32 0.553287 0.63178 0.0098104
## 18 0.00276537 36 0.535418 0.62263 0.0097671
## 19 0.00262356 38 0.529887 0.61072 0.0097090
## 20 0.00255265 41 0.522017 0.60987 0.0097048
## 21 0.00233993 42 0.519464 0.60200 0.0096655
## 22 0.00219811 45 0.512444 0.59689 0.0096395
## 23 0.00212721 48 0.505850 0.59519 0.0096308
## 24 0.00191449 52 0.497341 0.59264 0.0096176
## 25 0.00180813 59 0.482238 0.58668 0.0095866
## 26 0.00170177 61 0.478622 0.59094 0.0096088
## 27 0.00159541 62 0.476920 0.58030 0.0095529
## 28 0.00154223 69 0.464795 0.58030 0.0095529
## 29 0.00148904 73 0.458626 0.57903 0.0095461
## 30 0.00138268 75 0.455648 0.57924 0.0095472
## 31 0.00134723 82 0.445437 0.57371 0.0095175
## 32 0.00127632 85 0.441395 0.57456 0.0095221
## 33 0.00122314 89 0.436290 0.56882 0.0094908
## 34 0.00116996 99 0.420974 0.56882 0.0094908
## 35 0.00106360 105 0.413954 0.56414 0.0094650
## 36 0.00101042 119 0.399064 0.56265 0.0094568
## 37 0.00095724 124 0.393321 0.56371 0.0094627
## 38 0.00092179 142 0.375239 0.56520 0.0094709
## 39 0.00085088 150 0.367581 0.56520 0.0094709
## 40 0.00077998 183 0.338864 0.56286 0.0094579
## 41 0.00074452 193 0.330568 0.56265 0.0094568
## 42 0.00070907 207 0.320145 0.56265 0.0094568
## 43 0.00063816 210 0.318017 0.56137 0.0094497
## 44 0.00056726 272 0.278026 0.56265 0.0094568
## 45 0.00053180 280 0.273133 0.56562 0.0094733
## 46 0.00049635 301 0.261859 0.56626 0.0094768
## 47 0.00047862 319 0.250160 0.56626 0.0094768
## 48 0.00042544 328 0.245692 0.56328 0.0094603
## 49 0.00037226 476 0.181876 0.56477 0.0094686
## 50 0.00035453 488 0.176345 0.56562 0.0094733
## 51 0.00031908 515 0.166348 0.56839 0.0094885
## 52 0.00028363 614 0.132950 0.57116 0.0095036
## 53 0.00027350 639 0.125292 0.57413 0.0095198
## 54 0.00026590 646 0.123378 0.57435 0.0095209
## 55 0.00024817 650 0.122314 0.57435 0.0095209
## 56 0.00021272 664 0.118273 0.59179 0.0096132
## 57 0.00018613 1077 0.029994 0.59179 0.0096132
## 58 0.00017018 1085 0.028505 0.59221 0.0096154
## 59 0.00015954 1098 0.026165 0.59434 0.0096264
## 60 0.00014181 1104 0.025101 0.59689 0.0096395
## 61 0.00012763 1128 0.021698 0.59775 0.0096439
## 62 0.00012155 1138 0.020421 0.59838 0.0096471
## 63 0.00010636 1145 0.019570 0.60753 0.0096932
## 64 0.00000000 1328 0.000000 0.60753 0.0096932
plotcp(dtree_full)
# With 1328 splits the plot is unreadable -- kept only to illustrate the
# complexity of the unpruned tree.
rpart.plot(dtree_full, type = 2, extra = 101, fallen.leaves = F, tweak = 1.2, main = "Entire Tree for eye status")
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# Index of the cptable row with the lowest cross-validated error (xerror);
# this is the complexity level the full tree will be pruned back to.
best_value <- which.min(dtree_full$cptable[, "xerror"])
Here I created the full decision tree. We can see that the full tree is very complex — 64 levels of CP — and the tree plot is unreadable. I will find the level with the lowest xerror and then prune the tree. The lowest xerror is 0.56137 and it occurs at level 43, so we prune the tree at level 43.
# --- Pruned decision tree --------------------------------------------------
# CP value at the best (lowest-xerror) level, then cut the full tree back to
# that complexity; prune() keeps only the splits whose CP >= this threshold.
best_cp_for_pruning <- dtree_full$cptable[best_value, "CP"]
dtree_pruned <- prune(dtree_full, cp = best_cp_for_pruning)
# The pruned cptable stops at the selected level (43 rows, 210 splits).
printcp(dtree_pruned)
##
## Classification tree:
## rpart::rpart(formula = eyes_status ~ ., data = train_data, method = "class",
## control = rpart.control(minsplit = 1, cp = 0))
##
## Variables actually used in tree construction:
## [1] V1 V10 V11 V12 V13 V15 V16 V2 V3 V4 V5 V6 V7 V8 V9
##
## Root node error: 4701/10463 = 0.4493
##
## n= 10463
##
## CP nsplit rel error xerror xstd
## 1 0.09806424 0 1.00000 1.00000 0.0108234
## 2 0.05700915 1 0.90194 0.92023 0.0107153
## 3 0.03212083 2 0.84493 0.84876 0.0105687
## 4 0.02807913 3 0.81281 0.80664 0.0104595
## 5 0.01978302 5 0.75665 0.76431 0.0103321
## 6 0.01446501 7 0.71708 0.72814 0.0102087
## 7 0.01382685 8 0.70262 0.72155 0.0101847
## 8 0.01255052 9 0.68879 0.72049 0.0101808
## 9 0.00829611 10 0.67624 0.69730 0.0100925
## 10 0.00723250 11 0.66794 0.67560 0.0100045
## 11 0.00531802 13 0.65348 0.67305 0.0099938
## 12 0.00489258 16 0.63752 0.65667 0.0099234
## 13 0.00416933 18 0.62774 0.65369 0.0099103
## 14 0.00404169 28 0.56882 0.64327 0.0098635
## 15 0.00393533 29 0.56477 0.64199 0.0098577
## 16 0.00361625 31 0.55690 0.64050 0.0098508
## 17 0.00333262 32 0.55329 0.63178 0.0098104
## 18 0.00276537 36 0.53542 0.62263 0.0097671
## 19 0.00262356 38 0.52989 0.61072 0.0097090
## 20 0.00255265 41 0.52202 0.60987 0.0097048
## 21 0.00233993 42 0.51946 0.60200 0.0096655
## 22 0.00219811 45 0.51244 0.59689 0.0096395
## 23 0.00212721 48 0.50585 0.59519 0.0096308
## 24 0.00191449 52 0.49734 0.59264 0.0096176
## 25 0.00180813 59 0.48224 0.58668 0.0095866
## 26 0.00170177 61 0.47862 0.59094 0.0096088
## 27 0.00159541 62 0.47692 0.58030 0.0095529
## 28 0.00154223 69 0.46479 0.58030 0.0095529
## 29 0.00148904 73 0.45863 0.57903 0.0095461
## 30 0.00138268 75 0.45565 0.57924 0.0095472
## 31 0.00134723 82 0.44544 0.57371 0.0095175
## 32 0.00127632 85 0.44140 0.57456 0.0095221
## 33 0.00122314 89 0.43629 0.56882 0.0094908
## 34 0.00116996 99 0.42097 0.56882 0.0094908
## 35 0.00106360 105 0.41395 0.56414 0.0094650
## 36 0.00101042 119 0.39906 0.56265 0.0094568
## 37 0.00095724 124 0.39332 0.56371 0.0094627
## 38 0.00092179 142 0.37524 0.56520 0.0094709
## 39 0.00085088 150 0.36758 0.56520 0.0094709
## 40 0.00077998 183 0.33886 0.56286 0.0094579
## 41 0.00074452 193 0.33057 0.56265 0.0094568
## 42 0.00070907 207 0.32014 0.56265 0.0094568
## 43 0.00063816 210 0.31802 0.56137 0.0094497
rpart.plot(dtree_pruned, type = 2, extra = 101, fallen.leaves = F, tweak = 1.2, main = "Pruned Tree for eye status")
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
Here I pruned the decision tree at level 43. First I determined the best level for pruning and then pruned the tree accordingly. We can see that the final tree is simpler, prevents overfitting and performs better than the full tree.
# --- Evaluate all three trees on the test set ------------------------------
# Class predictions from each tree variant.
pred_y_dtree_default <- predict(dtree_default, newdata = test_x, type = "class")
pred_y_dtree_full <- predict(dtree_full, newdata = test_x, type = "class")
pred_y_dtree_pruned <- predict(dtree_pruned, newdata = test_x, type = "class")
# Confusion matrices with class "2" (closed eyes) as the positive class and
# precision/recall-style statistics.
default_decision_tree_confusion_m <- confusionMatrix(pred_y_dtree_default, reference = test_y, positive = "2", mode = "prec_recall")
full_decision_tree_confusion_m <- confusionMatrix(pred_y_dtree_full,test_y, positive = "2", mode = "prec_recall")
pruned_decision_tree_confusion_m <- confusionMatrix(pred_y_dtree_pruned, test_y, positive = "2", mode = "prec_recall")
default_decision_tree_confusion_m
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 2087 944
## 2 393 1070
##
## Accuracy : 0.7025
## 95% CI : (0.6889, 0.7158)
## No Information Rate : 0.5518
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3826
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Precision : 0.7314
## Recall : 0.5313
## F1 : 0.6155
## Prevalence : 0.4482
## Detection Rate : 0.2381
## Detection Prevalence : 0.3255
## Balanced Accuracy : 0.6864
##
## 'Positive' Class : 2
##
full_decision_tree_confusion_m
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 2060 490
## 2 420 1524
##
## Accuracy : 0.7975
## 95% CI : (0.7855, 0.8092)
## No Information Rate : 0.5518
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.5893
##
## Mcnemar's Test P-Value : 0.02218
##
## Precision : 0.7840
## Recall : 0.7567
## F1 : 0.7701
## Prevalence : 0.4482
## Detection Rate : 0.3391
## Detection Prevalence : 0.4326
## Balanced Accuracy : 0.7937
##
## 'Positive' Class : 2
##
pruned_decision_tree_confusion_m
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 2062 473
## 2 418 1541
##
## Accuracy : 0.8017
## 95% CI : (0.7898, 0.8133)
## No Information Rate : 0.5518
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.5981
##
## Mcnemar's Test P-Value : 0.07044
##
## Precision : 0.7866
## Recall : 0.7651
## F1 : 0.7757
## Prevalence : 0.4482
## Detection Rate : 0.3429
## Detection Prevalence : 0.4359
## Balanced Accuracy : 0.7983
##
## 'Positive' Class : 2
##
I am predicting with all 3 decision tree classifiers I created, with positive class "2" = closed eyelids. Based on the confusion matrices and the Accuracy, Precision, Recall and F1 scores, the pruned decision tree performs best of the three models. Its accuracy is 80.17%, which is not outstanding but can be satisfactory in some cases; the same holds for its Precision, Recall and F1 values. I saved the best confusion matrix in a variable so I can build the final comparison table.
# --- SVM, linear kernel ----------------------------------------------------
# C-classification SVM with a linear kernel and cost 1.
# NOTE(review): scale = FALSE means the features (magnitudes around 4000+)
# are used unscaled; SVMs are sensitive to feature scale, which may explain
# the weak results below -- consider scale = TRUE.
svm_model<- svm(eyes_status ~ .,
data = train_data,
type = "C-classification",
kernel = "linear",
cost = 1,
scale = FALSE)
svm_model
##
## Call:
## svm(formula = eyes_status ~ ., data = train_data, type = "C-classification",
## kernel = "linear", cost = 1, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 5843
# rho is the negative intercept of the decision function.
svm_model$rho
## [1] -5783.164
I created a linear SVM classification model on the training data. We can see that it has a very large number of support vectors: 5843. The model object also exposes the indexes of the support vectors, their coordinates, and (via rho) the negative intercept of the decision boundary.
# Training accuracy (fraction of correct predictions on the fit data).
pred_train <- predict(svm_model, train_data)
mean(pred_train == train_data$eyes_status)
## [1] 0.6246774
# Test-set predictions and confusion matrix (default positive class "1").
pred_test <- predict(svm_model, test_data)
linear_svm_matrix <- confusionMatrix(data = pred_test, reference = test_y)
linear_svm_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 1928 1071
## 2 552 943
##
## Accuracy : 0.6389
## 95% CI : (0.6246, 0.6529)
## No Information Rate : 0.5518
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2517
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7774
## Specificity : 0.4682
## Pos Pred Value : 0.6429
## Neg Pred Value : 0.6308
## Prevalence : 0.5518
## Detection Rate : 0.4290
## Detection Prevalence : 0.6673
## Balanced Accuracy : 0.6228
##
## 'Positive' Class : 1
##
We can see that this model's accuracy is 62.47% on the training set and 63.89% on the test set, which is poor, so I would not use this model for prediction.
# --- SVM, polynomial kernel (degree 2) -------------------------------------
# Same setup as the linear SVM but with a degree-2 polynomial kernel, to
# check whether a non-linear boundary helps.  (Reuses the svm_model name,
# overwriting the linear model fitted above.)
svm_model<- svm(eyes_status ~ .,
data = train_data,
type = "C-classification",
kernel = "polynomial",
cost = 1,
degree = 2,
scale = FALSE)
svm_model
##
## Call:
## svm(formula = eyes_status ~ ., data = train_data, type = "C-classification",
## kernel = "polynomial", cost = 1, degree = 2, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 5795
Here I train the model with polynomial kernel to see if there is difference in accuracy.
# Training accuracy of the polynomial SVM.
pred_train <- predict(svm_model, train_data)
mean(pred_train == train_data$eyes_status)
## [1] 0.636911
# Test-set predictions and confusion matrix.
pred_test <- predict(svm_model, test_data)
poly_svm_matrix <- confusionMatrix(data = pred_test, reference = test_y)
poly_svm_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 1984 1028
## 2 496 986
##
## Accuracy : 0.6609
## 95% CI : (0.6468, 0.6747)
## No Information Rate : 0.5518
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2969
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8000
## Specificity : 0.4896
## Pos Pred Value : 0.6587
## Neg Pred Value : 0.6653
## Prevalence : 0.5518
## Detection Rate : 0.4415
## Detection Prevalence : 0.6702
## Balanced Accuracy : 0.6448
##
## 'Positive' Class : 1
##
As the accuracy shows, the polynomial kernel is slightly better than the linear one, but it still performs very badly.
# --- SVM, radial (RBF) kernel ----------------------------------------------
# FIX: the original call passed degree = 2, but `degree` only applies to the
# polynomial kernel and is silently ignored for kernel = "radial"; it is
# removed here.  (The echoed output below predates this fix, which is why
# its Call line still shows degree = 2.)
# NOTE(review): with scale = FALSE and raw features of this magnitude the
# RBF fit degenerates -- every training row (10463) becomes a support
# vector; consider scale = TRUE.
svm_model <- svm(eyes_status ~ .,
                 data = train_data,
                 type = "C-classification",
                 kernel = "radial",
                 cost = 1,
                 scale = FALSE)
svm_model
##
## Call:
## svm(formula = eyes_status ~ ., data = train_data, type = "C-classification",
## kernel = "radial", cost = 1, degree = 2, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 10463
# Test-set evaluation of the radial SVM: it predicts only class 1, so its
# accuracy equals the no-information rate and Kappa is 0.
pred_test <- predict(svm_model, test_data)
radial_svm_matrix <- confusionMatrix(data = pred_test, reference = test_y)
radial_svm_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 2480 2014
## 2 0 0
##
## Accuracy : 0.5518
## 95% CI : (0.5372, 0.5665)
## No Information Rate : 0.5518
## P-Value [Acc > NIR] : 0.5062
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5518
## Neg Pred Value : NaN
## Prevalence : 0.5518
## Detection Rate : 0.5518
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
The radial model used every training row as a support vector (10463 of 10463), which indicates it memorised the training data rather than learning a usable boundary. On the test data it predicts only class 1, so this model performs terribly — its 55.18% accuracy is exactly the no-information rate.
# --- kNN: tune k via leave-one-out cross-validation ------------------------
# knn.cv() predicts each training row from its k nearest neighbours among
# the remaining rows.  Try every odd k from 1 to 85 (odd values avoid
# tie-breaking between the two classes); store predictions keyed by k.
training_pred <- list()
Kselection <- seq(1, 85, 2)
for (i in Kselection) {
training_pred[[as.character(i)]] <- knn.cv(train = train_x,
cl = train_y,
k = i)
}
# Accuracy of a prediction vector against a reference vector.
# Both inputs are re-factored onto the union of their levels first, so
# confusionMatrix() never fails when a level is present in only one of
# them.  Returns the named "Accuracy" element of the confusion-matrix
# overall statistics.
get_accuracy <- function(prediction, reference) {
  shared_levels <- union(levels(prediction), levels(reference))
  pred_f <- factor(prediction, levels = shared_levels)
  ref_f <- factor(reference, levels = shared_levels)
  cm <- confusionMatrix(data = pred_f, reference = ref_f)
  cm$overall["Accuracy"]
}
# Accuracy for every tried k; vapply() replaces sapply() for a type-stable
# numeric result with the same names.
accuracies <- vapply(training_pred, get_accuracy, numeric(1), reference = train_y)
# BUG FIX: plotly's scatter `mode` must be "lines" (or "markers",
# "lines+markers"); "line" is not a valid mode value.
plot_ly(x = Kselection, y = accuracies, type = "scatter", mode = "lines")
# Final kNN model with the tuned k = 7, evaluated on the test set.
test_pred <- knn(train = train_x,
cl = train_y,
test = test_x,
k = 7)
knn_matrix <- confusionMatrix(data = test_pred, reference = test_y)
knn_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 2311 227
## 2 169 1787
##
## Accuracy : 0.9119
## 95% CI : (0.9032, 0.92)
## No Information Rate : 0.5518
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8214
##
## Mcnemar's Test P-Value : 0.004179
##
## Sensitivity : 0.9319
## Specificity : 0.8873
## Pos Pred Value : 0.9106
## Neg Pred Value : 0.9136
## Prevalence : 0.5518
## Detection Rate : 0.5142
## Detection Prevalence : 0.5648
## Balanced Accuracy : 0.9096
##
## 'Positive' Class : 1
##
For kNN I searched for the best number of neighbours and plotted the accuracies. I found that 7 and 9 neighbours give the best cross-validated accuracy, so I trained the final model with 7 neighbours, reaching a test accuracy of 91.19%.
# --- Random forest (ranger) ------------------------------------------------
# ranger needs a factor target for classification.  NOTE: this permanently
# converts train_y to a factor for the rest of the script.
train_y <- as.factor(train_y)
rf.orig <- ranger(x = train_x, y = train_y)
# rf.orig$predictions are out-of-bag predictions, so this confusion matrix
# is an honest estimate despite being computed on the training rows.
confusionMatrix(data = rf.orig$predictions, reference = train_y,
positive = "2", mode = "prec_recall")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 5223 1060
## 2 539 3641
##
## Accuracy : 0.8472
## 95% CI : (0.8401, 0.854)
## No Information Rate : 0.5507
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.688
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Precision : 0.8711
## Recall : 0.7745
## F1 : 0.8200
## Prevalence : 0.4493
## Detection Rate : 0.3480
## Detection Prevalence : 0.3995
## Balanced Accuracy : 0.8405
##
## 'Positive' Class : 2
##
# Test-set evaluation with class "2" (closed eyes) as positive.
my_pred <- predict(object = rf.orig, data = test_x)
rf_matrix <- confusionMatrix(data = my_pred$predictions, reference = test_y, positive = "2", mode = "prec_recall")
rf_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 2333 323
## 2 147 1691
##
## Accuracy : 0.8954
## 95% CI : (0.8861, 0.9042)
## No Information Rate : 0.5518
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7868
##
## Mcnemar's Test P-Value : 6.907e-16
##
## Precision : 0.9200
## Recall : 0.8396
## F1 : 0.8780
## Prevalence : 0.4482
## Detection Rate : 0.3763
## Detection Prevalence : 0.4090
## Balanced Accuracy : 0.8902
##
## 'Positive' Class : 2
##
We can see good accuracy, precision, recall and F1 score on both the training and test sets. The model performs better on the test set and, with an accuracy of 89.54%, is the best model so far. We could also try rebalancing the data.
# --- AdaBoost: tune the number of boosting iterations (mfinal) -------------
# adabag::boosting needs a factor target.
train_data$eyes_status <- as.factor(train_data$eyes_status)
# FIX: initialise as a list; the original used c() (i.e. NULL) and relied on
# R silently promoting it to a list on the first [[<- assignment.
ada_list <- list()
mfinal_values <- c(10, 50, 100, 200, 300, 400)
for (mfinal in mfinal_values) {
  model <- adabag::boosting(eyes_status ~ ., data = train_data, boos = TRUE, mfinal = mfinal)
  ada_list[[as.character(mfinal)]] <- model
}
# Test accuracy per mfinal.  Building a list of one-row frames and binding
# once replaces the original rbind-inside-a-loop growth pattern.
accuracy_rows <- lapply(mfinal_values, function(m) {
  preds <- predict(ada_list[[as.character(m)]], newdata = test_data)
  data.frame(MFinal = m, Accuracy = mean(preds$class == test_data$eyes_status))
})
accuracy_data <- do.call(rbind, accuracy_rows)
plot <- plot_ly(accuracy_data, type = "scatter", mode = "lines+markers", x = ~MFinal, y = ~Accuracy, name = "Accuracy")
plot
# Final evaluation uses the mfinal = 300 model; the confusion matrix below
# corresponds to that choice.
model <- ada_list[["300"]]
my_pred_adaboost <- predict(model, newdata = test_x)
adaboost_matrix <- confusionMatrix(as.factor(my_pred_adaboost$class), test_y, mode = "prec_recall", positive = "2")
adaboost_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 2202 472
## 2 278 1542
##
## Accuracy : 0.8331
## 95% CI : (0.8219, 0.8439)
## No Information Rate : 0.5518
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6595
##
## Mcnemar's Test P-Value : 1.823e-12
##
## Precision : 0.8473
## Recall : 0.7656
## F1 : 0.8044
## Prevalence : 0.4482
## Detection Rate : 0.3431
## Detection Prevalence : 0.4050
## Balanced Accuracy : 0.8268
##
## 'Positive' Class : 2
##
I trained AdaBoost classifiers with different mfinal values (tuning the mfinal parameter). The confusion matrix above uses the mfinal = 300 model, which reaches an accuracy of 83.31% on the test set.
# --- XGBoost ---------------------------------------------------------------
# xgboost requires a numeric matrix of features and a 0/1 label vector.
train_x_matrix <- as.matrix(train_x)
# Recode the target: 1 -> 0, 2 -> 1 (class "2" = closed eyes per the
# earlier confusion matrices).
train_y_binary <- ifelse(train_y == 1, 0, 1)
# NOTE(review): nrounds = 3000 with default learning rate and no early
# stopping may overfit -- confirm against a validation curve.
xgb <- xgboost::xgboost(data = train_x_matrix,
label = train_y_binary,
nrounds=3000, objective = "binary:logistic")
# Predict probabilities on the test set and threshold at 0.5.
xgbtest_x <- as.matrix(test_x)
mypred <- predict(xgb,newdata = xgbtest_x)
xgbtest_y <- ifelse(test_y == 1, 0, 1)
my_pred_xgboost <- as.integer(mypred>0.5)
# Positive class "1" here corresponds to original class 2 (closed eyes).
xgb_matrix <- confusionMatrix(as.factor(my_pred_xgboost),as.factor(xgbtest_y), mode="prec_recall", positive = "1")
xgb_matrix
For the XGBoost model I had to convert the test_x and train_x data frames to matrices, and recode test_y and train_y to binary labels, so the algorithm works. I ran the model with different numbers of rounds and 3000 seemed to work best, giving a good test accuracy of about 91.3%.
# --- Final comparison table ------------------------------------------------
# Collect Accuracy / Precision / Recall / F1 from every saved confusion
# matrix, in the same order as method_algos.  NOTE(review): the SVM and kNN
# matrices use positive class "1" while the tree/forest/boosting matrices
# use "2", so the Precision/Recall/F1 columns are not computed on the same
# positive class across rows -- compare with care.
method_algos <- c("Default_tree", "Full_tree", "Pruned_tree", "Linear_svm", "Polynomial_svm", "Radial_svm", "KNN", "Random_Forest", "Adaboost", "XGboost")
accuracy_of_all <- c(default_decision_tree_confusion_m$overall["Accuracy"], full_decision_tree_confusion_m$overall["Accuracy"], pruned_decision_tree_confusion_m$overall["Accuracy"], linear_svm_matrix$overall["Accuracy"], poly_svm_matrix$overall["Accuracy"], radial_svm_matrix$overall["Accuracy"], knn_matrix$overall["Accuracy"], rf_matrix$overall["Accuracy"], adaboost_matrix$overall["Accuracy"], xgb_matrix$overall["Accuracy"])
precision_of_all <- c(default_decision_tree_confusion_m$byClass["Precision"], full_decision_tree_confusion_m$byClass["Precision"], pruned_decision_tree_confusion_m$byClass["Precision"], linear_svm_matrix$byClass["Precision"], poly_svm_matrix$byClass["Precision"], radial_svm_matrix$byClass["Precision"], knn_matrix$byClass["Precision"], rf_matrix$byClass["Precision"], adaboost_matrix$byClass["Precision"], xgb_matrix$byClass["Precision"])
recall_of_all <- c(default_decision_tree_confusion_m$byClass["Recall"], full_decision_tree_confusion_m$byClass["Recall"], pruned_decision_tree_confusion_m$byClass["Recall"], linear_svm_matrix$byClass["Recall"], poly_svm_matrix$byClass["Recall"], radial_svm_matrix$byClass["Recall"], knn_matrix$byClass["Recall"], rf_matrix$byClass["Recall"], adaboost_matrix$byClass["Recall"], xgb_matrix$byClass["Recall"])
f1_of_all <- c(default_decision_tree_confusion_m$byClass["F1"], full_decision_tree_confusion_m$byClass["F1"], pruned_decision_tree_confusion_m$byClass["F1"], linear_svm_matrix$byClass["F1"], poly_svm_matrix$byClass["F1"], radial_svm_matrix$byClass["F1"], knn_matrix$byClass["F1"], rf_matrix$byClass["F1"], adaboost_matrix$byClass["F1"], xgb_matrix$byClass["F1"])
summary_df <- data.frame(Method = method_algos,Accuracy = accuracy_of_all, Precision = precision_of_all, Recall = recall_of_all, F1_score = f1_of_all)
paged_df <- paged_table(summary_df)
paged_df
# Line plot of all four metrics across the ten approaches; method names are
# placed on the x axis via tickvals/ticktext.
plot_ly(summary_df, type = "scatter", mode = "lines+markers", y = ~Accuracy, x = seq(1, nrow(summary_df)), name = "Accuracy") |>
add_trace(y = ~Precision, x = seq(1, nrow(summary_df)), name = "Precision") |>
add_trace(y = ~Recall, x = seq(1, nrow(summary_df)), name = "Recall") |>
add_trace(y = ~F1_score, x = seq(1, nrow(summary_df)), name = "F1_score")|>
layout(xaxis = list(tickmode = "array", tickvals = seq(1, nrow(summary_df)), ticktext = summary_df$Method),
title = "Summary Plot of precisions for all approaches", yaxis = list(title = 'Values'))
I created a plot comparing all the approaches I trained. We can see that the best-performing model is kNN, with all measures above 90 percent. The worst-performing approaches for this classification task are the three support vector machine variants.